Michael Collins NLP Homework 4
课程主页:http://www.cs.columbia.edu/~cs4705/
课程网盘地址:
链接:https://pan.baidu.com/s/1KijgO7yjL_MVCC9zKZ7Jdg
提取码:t1i3
这一次回顾Michael Collins NLP作业4。
Question 1
(a)
令上式为$0$可得
注意$f_1=f_2$,所以
(b)
令上式为$0$可得
取$j=1,2$可得
构造集合
那么上式等价于
由于上式对$j=1,2$成立,这说明$v_1,v_2$同号。
Question 2
假设
那么
令上式为$0$可得
Question 3
(a)
定义
所以需要两个参数$v_1, v_2$。
(b)
(c)
令
那么
注意到
所以条件为
Question 4
由于一些函数要共用,所以编写了helper.py文件
import tagger_config
from subprocess import PIPE
import sys, subprocess
# Expose tagger_config.tags under a module-level name; none of the helper
# functions in this listing read it.
tags = tagger_config.tags
def sentence_reader(filename):
    """Read a blank-line-separated corpus file into a list of sentences.

    Every non-blank line is split on whitespace into a list of fields.
    Consecutive non-blank lines form one sentence and a blank line ends it.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of sentences, each a list of per-line field lists.
    """
    sentences = []
    sentence = []
    with open(filename) as f:
        for line in f:
            fields = line.strip().split()
            if fields:
                sentence.append(fields)
            else:
                # A blank line terminates the sentence collected so far.
                sentences.append(sentence)
                sentence = []
    # Keep the last sentence when the file does not end with a blank line;
    # without this it would be silently discarded.
    if sentence:
        sentences.append(sentence)
    return sentences
def transform(sentence):
    """Serialize a sentence into tab/newline-delimited text.

    The fields of each token are joined with tabs and the tokens are joined
    with newlines; no trailing newline is added.

    Args:
        sentence: A list of tokens, each a sequence of string fields.

    Returns:
        The serialized sentence, or an empty string for an empty sentence.
    """
    # str.join replaces the manual index loops and repeated string +=.
    return "\n".join("\t".join(word) for word in sentence)
def process(args):
    """Start a helper process ("server") to exchange commands with.

    Args:
        args: Command line as a list, passed to ``subprocess.Popen``.

    Returns:
        The ``Popen`` object, with stdin and stdout connected to pipes.
    """
    # universal_newlines=True makes the pipes text streams on Python 3, so
    # str requests can be written and str replies read; without it the pipes
    # accept and return bytes only. On Python 2 it merely normalizes line
    # endings on the output side.
    return subprocess.Popen(
        args, stdin=PIPE, stdout=PIPE, universal_newlines=True
    )
def call(process, stdin):
    """Send one request to a helper process and collect its reply lines.

    The request is written followed by an empty line. Reply lines are then
    read and stripped until the first line that is empty after stripping
    (which is also what end-of-file yields).

    Args:
        process: Object with ``stdin`` and ``stdout`` stream attributes, as
            returned by ``process``.
        stdin: Request text to send.

    Returns:
        The list of stripped, non-empty reply lines.
    """
    process.stdin.write(stdin + "\n\n")
    # Push the request through any buffering on the pipe; otherwise the
    # child may never receive it and the readline below blocks forever.
    process.stdin.flush()
    res = []
    while True:
        line = process.stdout.readline().strip()
        if not line:
            break
        res.append(line)
    return res
def get_feature(sentence, his):
    """Build the bigram and word/tag feature strings for one history.

    Args:
        sentence: List of tokens; field 0 of each token is the word.
        his: Sequence indexed as ``[position, previous tag, current tag]``,
            where position is 1-based and may be given as a string.

    Returns:
        A ``(bigram_feature, tag_feature)`` tuple of strings.
    """
    prev_tag = his[1]
    cur_tag = his[2]
    bigram_feature = ":".join(("BIGRAM", prev_tag, cur_tag))
    # Positions are 1-based, sentence indices are 0-based.
    token = sentence[int(his[0]) - 1][0]
    tag_feature = ":".join(("TAG", token, cur_tag))
    return bigram_feature, tag_feature
def get_feature_v1(sentence, his):
    """Build bigram, word/tag and suffix feature strings for one history.

    Args:
        sentence: List of tokens; field 0 of each token is the word.
        his: Sequence indexed as ``[position, previous tag, current tag]``,
            where position is 1-based and may be given as a string.

    Returns:
        A list holding the BIGRAM feature, the TAG feature, and one SUFF
        feature for each suffix length from 1 up to 3 (limited by the
        length of the word).
    """
    prev_tag = his[1]
    tag = his[2]
    # Positions are 1-based, sentence indices are 0-based.
    word = sentence[int(his[0]) - 1][0]
    features = [
        "BIGRAM:" + prev_tag + ":" + tag,
        "TAG:" + word + ":" + tag,
    ]
    # Suffix features describe only the word being tagged. Emitting them for
    # every word of the sentence would attach the same sentence-wide suffix
    # set to every position, so they could not help tell positions apart.
    for k in range(1, min(3, len(word)) + 1):
        features.append("SUFF:" + word[-k:] + ":" + str(k) + ":" + tag)
    return features
Q4
from helper import *
#读取
def get_value(filename):
    """Load a feature-weight table from a text file.

    Each line must split on whitespace into exactly two fields: a feature
    string and a number. If a feature occurs more than once, the last
    occurrence wins.

    Args:
        filename: Path of the weight file.

    Returns:
        A dict mapping feature string to float weight.
    """
    with open(filename) as model_file:
        pairs = (line.strip().split() for line in model_file)
        return {feature: float(weight) for feature, weight in pairs}
def generate(output, sentences, F):
    """Tag every sentence and write ``word<TAB>tag`` lines to a file.

    For each sentence, candidate histories are requested from the
    module-level ``enum_server``. Each history is scored as the sum of the
    module-level ``value`` weights of its features (unknown features count
    as 0), and the scored list is sent to the module-level
    ``decoder_server``. The last field of the i-th decoder line is written
    as the tag of the i-th word; a blank line follows each sentence.

    Args:
        output: Path of the file to write.
        sentences: List of sentences; field 0 of each token is the word.
        F: Feature function called as ``F(sentence, history_fields)`` and
            returning an iterable of feature strings.
    """
    # Text mode: everything written below is str, which a binary-mode
    # handle rejects on Python 3.
    with open(output, "w") as out:
        for sentence in sentences:
            histories = call(enum_server, transform(sentence))
            scored = []
            for his in histories:
                features = F(sentence, his.split())
                score = sum(value.get(fea, 0) for fea in features)
                scored.append(his + "\t" + str(score))
            # Decode the best-scoring tag sequence.
            decoded = call(decoder_server, "\n".join(scored))
            for i, token in enumerate(sentence):
                out.write(token[0] + "\t" + decoded[i].split()[-1] + "\n")
            out.write("\n")
# Helper processes used by generate(): enum_server is asked for the candidate
# histories of a sentence, decoder_server is sent the scored histories.
enum_server = process(["python", "tagger_history_generator.py", "ENUM"])
decoder_server = process(["python", "tagger_decoder.py", "HISTORY"])
filename = "tag_dev.dat"
sentences = sentence_reader(filename)
# Q4: tag the sentences with the weights loaded from tag.model and the
# BIGRAM/TAG features; generate() reads the module-level `value` dict.
f1 = "tag.model"
o1 = "Q4.out"
value = get_value(f1)
generate(o1, sentences, get_feature)
2226 2459 0.905246034974
Question 5
Q5
#参考https://github.com/huxiuhan/nlp-hw
from helper import *
# Perceptron weights keyed by feature string; a missing feature weighs 0.
value = dict()
enum_server = process(["python", "tagger_history_generator.py", "ENUM"])
gold_server = process(["python", "tagger_history_generator.py", "GOLD"])
decoder_server = process(["python", "tagger_decoder.py", "HISTORY"])
filename = "tag_train.dat"
sentences = sentence_reader(filename)
# Number of passes over the training sentences.
K = 5

# The candidate and gold histories do not depend on the weights, so they are
# fetched once per sentence and reused in every pass.
History = []
History_label = []
for sentence in sentences:
    sent = transform(sentence)
    History.append(call(enum_server, sent))
    History_label.append(call(gold_server, sent))

# Training
for k in range(K):
    for i, sentence in enumerate(sentences):
        history = History[i]
        # Gold-standard histories for this sentence.
        history_label = History_label[i]
        score_ = []
        for his in history:
            features = get_feature_v1(sentence, his.split())
            score = sum(value.get(fea, 0) for fea in features)
            score_.append(his + "\t" + str(score))
        # Decode the best-scoring sequence under the current weights.
        res = call(decoder_server, "\n".join(score_))
        for j in range(len(history_label)):
            # Compare the whole (position, previous tag, tag) history as
            # tokens. Taking the last *character* of the decoder line would
            # never equal a multi-character tag, and comparing the current
            # tag alone would skip positions where only the previous tag
            # (hence the BIGRAM feature) differs.
            predicted = res[j].split()[:3]
            gold = history_label[j].split()[:3]
            if predicted == gold:
                # Identical histories have identical features, so the
                # -1/+1 updates below would cancel out anyway.
                continue
            # Penalize the features of the wrongly predicted history...
            for fea in get_feature_v1(sentence, predicted):
                value[fea] = value.get(fea, 0) - 1
            # ...and reward the features of the gold history.
            for fea in get_feature_v1(sentence, gold):
                value[fea] = value.get(fea, 0) + 1

# Save the learned weights, one "feature weight" pair per line.
outputname = "Q5.model"
# Text mode: the lines written are str, which a binary-mode handle rejects
# on Python 3.
with open(outputname, "w") as model_file:
    for fea in value:
        model_file.write(fea + " " + str(value[fea]) + "\n")
2184 2459 0.888165921106
本博客所有文章除特别声明外,均采用 CC BY-NC-SA 4.0 许可协议。转载请注明来自 Doraemonzzz!
评论
ValineLivere